scRNA-seq data is highly dimensional as every analyzed genes corresponds to a new dimension. Thus, dimension reduction methods are necassary to enable the work with the data. This allows not only to visualize the data, but is also the basis for clustering the data.
Loading the necessary libraries
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import plotly.express as px
from scipy.sparse import csr_matrix
# settings can be adapted individually
sc.settings.verbosity = 3
sc.logging.print_header()
sc.settings.set_figure_params(dpi = 100, format = 'png')
scanpy==1.7.2 anndata==0.7.6 umap==0.4.6 numpy==1.20.1 scipy==1.6.2 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.12.2 python-igraph==0.9.1 louvain==0.7.0
Set random seed
rng_seed = 2021
rng = np.random.RandomState(rng_seed)
Note: PCA calculation via sklearn and scanpy give similar results
Load preprocessed data (see notebook Data_Preprocessing)
# load preprocessed data
canogamez = sc.read_h5ad("result_files/canogamez_preprocessing.h5ad") # change to your data path
Calculate PCA
sc.tl.pca(canogamez, svd_solver='arpack', random_state=rng)
computing PCA
on highly variable genes
with n_comps=50
finished (0:00:02)
Plotting
fig = px.scatter(canogamez.obsm["X_pca"], x = 0, y = 1, color = canogamez.obs['cytokine.condition'], title='PCA',
labels = {
"0": "PC1",
"1": "PC2",
})
fig.show()
Load preprocessed data (see notebook Data_Preprocessing)
# load preprocessed data
canogamez = sc.read_h5ad("result_files/canogamez_preprocessing.h5ad") # change to your data path
# save AnnData as CRS matrix
# positions of null values are saved instead of storing all null values itself
# -> faster calculation and less memory usage
canogamez.X = csr_matrix(canogamez.X)
Calculate ICA
# function to calculate ICA
def ica(adata, n_components, inplace = True):
'''Computes the ICA on AnnData'''
from sklearn.decomposition import FastICA
ica_transformer = FastICA(n_components = n_components, random_state = rng)
x_ica = ica_transformer.fit_transform(adata.X.toarray())
if inplace:
adata.obsm['X_ica'] = x_ica
adata.varm['ICs'] = ica_transformer.components_.T
else:
return ica_transformer
ica(canogamez, 40) # calculate 40 components, taken for UMAP calculation based on PCA
Plotting
fig = px.scatter(canogamez.obsm["X_ica"], x = 0, y = 1, color = canogamez.obs['cytokine.condition'],
title = 'ICA',
labels = {
"0": "IC1",
"1": "IC2",
})
fig.show()
Load raw data
canogamez = sc.read_h5ad("/corgi/scdata/henriksson/canogamez/cleaned.h5ad") # change to your data path
# set `zero_center to `False` to avoid negative values
sc.pp.scale(canogamez, zero_center = False)
Calculate NMF
def nmf(adata, n_components, inplace = True):
'''Computes the NFM on AnnData'''
from sklearn.decomposition import NMF
model = NMF(n_components = n_components, init = 'random', random_state = 0, max_iter = 10000)
model = model.fit(adata.X.toarray())
nmf_features = model.transform(adata.X.toarray())
if inplace:
adata.obsm['X_nfm'] = nmf_features
adata.varm['NFMs'] = model.components_.T
else:
return model
nmf(canogamez,16) # calculate 16 components as original AnnData is composed of 16 samples
Plotting
fig = px.scatter(canogamez.obsm["X_nfm"], x = 0, y = 1, color = canogamez.obs['cytokine.condition'],
title = 'NMF',
labels = {
"0": "Component 1",
"1": "Component 2",
})
fig.show()
Load raw data
canogamez = sc.read_h5ad("/corgi/scdata/henriksson/canogamez/cleaned.h5ad") # change to your data path
sc.pp.scale(canogamez, zero_center=False)
canogamez.X = csr_matrix(canogamez.X)
Calculate LDA
def LaDiAl(adata , inplace=True):
'''Computes the Latent Dirichlet Allocation on AnnData'''
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation()
model = lda.fit(adata.X.toarray())
if inplace:
adata.obsm['X_lda'] = lda.transform(adata.X)
adata.varm['lda'] = lda.components_.T
else :
return model
LaDiAl(canogamez) # no number of components given, calculated until converging
Plotting
fig = px.scatter(canogamez.obsm["X_lda"], x = 0, y = 1, color = canogamez.obs['cytokine.condition'],
title = 'LDA',
labels = {
"0": "Component 1",
"1": "Component 2",
})
fig.show()
Load raw data
canogamez = sc.read_h5ad("/corgi/scdata/henriksson/canogamez/cleaned.h5ad")
sc.pp.scale(canogamez, zero_center = False)
canogamez.X = csr_matrix(canogamez.X)
Calculate Linear Discriminant Analysis
def LiDiAn(adata, y, inplace = True):
'''Computes the Linear Discriminant Analysis on AnnData'''
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
n_components = int(len(y.categories) - 1)
LDA = LDA(n_components = n_components)
model = LDA.fit(adata.X.toarray(), y)
if inplace:
adata.obsm['X_LDA'] = LDA.transform(adata.X.toarray())
adata.varm['LDAs'] = LDA.coef_.T
else :
return model
Algorithm needs an array y with given target values for each cell. Here the annotation cytokine.condition was used.
cytokine = canogamez.obs['cytokine.condition']
cytokine = pd.Series(pd.Categorical(cytokine))
y = cytokine.array
LiDiAn(canogamez, y)
Plotting
fig = px.scatter(canogamez.obsm["X_LDA"], x = 0, y = 1, color = canogamez.obs['cytokine.condition'],
title = 'Linear Discriminant Analysis',
labels = {
"0": "Component 1",
"1": "Component 2",
})
fig.show()
# calculate PCA
canogamez = sc.read_h5ad("result_files/canogamez_preprocessing.h5ad") # change to your data path
sc.tl.pca(canogamez, svd_solver='arpack', random_state=rng)
computing PCA
on highly variable genes
with n_comps=50
finished (0:00:02)
# Add 'gene_ids' to .var
gene_names = canogamez.var.index.to_list()
canogamez.var['gene_ids'] = gene_names
Example Analysis based on PCA Factor Analysis
Change .varm[] and .obsm[] to the variable names chosen for the different factor analysis calculations to exprore the genes in the components identified by the other methods
# code based on: https://nbisweden.github.io/workshop-scRNAseq/labs/compiled/scanpy/scanpy_02_dim_reduction.html
# plot genes in first 4 PCs
genes = canogamez.var['gene_ids']
for pc in [1,2,3,4]:
g = canogamez.varm['PCs'][:,pc-1]
o = np.argsort(g)
sel = np.concatenate((o[:10],o[-10:])).tolist()
emb = canogamez.obsm['X_pca'][:,pc-1]
tempdata = canogamez[np.argsort(emb),]
sc.pl.heatmap(tempdata, var_names = genes[sel].index.tolist(),
groupby='cell.type', swap_axes = True, use_raw=False)
# get gene names in first component
g = canogamez.varm['PCs'][:,0]
o = np.argsort(g)
sel = np.concatenate((o[:10],o[-10:])).tolist()
genes_pc1 = genes[sel].index.tolist()
import gseapy as gp
from gseapy.plot import dotplot
enr = gp.enrichr(gene_list = genes_pc1,
gene_sets = ['MSigDB_Hallmark_2020'], # alternative sets : https://maayanlab.cloud/Enrichr/#stats
organism = 'Human',
cutoff = 1 # value should be in the range(0,1)
)
dotplot(enr.res2d,cmap='viridis_r', cutoff=1)
<AxesSubplot:xlabel='-log$_{10}$(Adjusted P-value)'>
The by the different factor analysis calculated components can be used for further analysis in scanpy (similar to notebooks 'Exploring_Cell-Types_in_UNS_data.ipynb' and 'Exploring_Cell-Types_in_stimulated_data.ipynb'). For this purpose the calculation has to be stored under the variable name of the PCA, e.g. adata.obsm['X_ica'] as canogamez.obsm['X_pca'] and adata.varm['ICs'] as canogamez.varm['PCs'].